# 1.Setup
rm(list=ls())
setwd("/Users/ac/Documents/columbia/study/2015fall/G5999 Master Thesis/Thesis Writing/Data&R")
source("/Users/ac/Documents/columbia/study/2015fall/G5999 Master Thesis/Thesis Writing/Data&R/data functions.R")
LoadLibrary()

# 2.Climate Variables - Sort temp prcp data for each state
setwd("/Users/ac/Documents/columbia/study/2015fall/G5999 Master Thesis/Thesis Writing/Data&R/temp prcp data")

CO <- read.csv("CT CO.csv", header=T)
CO <- subset(CO, CO$STATION_NAME == "ALAMOSA SAN LUIS VALLEY REGIONAL AIRPORT CO US", select = -STATION)
CO[CO == -9999] <- NA
CO$STATION_NAME <- "CO"
  
IL <- read.csv("IL.csv", header=T)
IL <- subset(IL, select = -STATION)
IL[IL == -9999] <- NA
IL$STATION_NAME <- "IL"

IN <- read.csv("IN IA LA.csv", header=T)
IN <- subset(IN, IN$STATION_NAME == "ANDERSON SEWAGE PLANT IN US", select = -STATION)
IN[IN == -9999] <- NA
IN$STATION_NAME <- "IN"

IA <- read.csv("IN IA LA.csv", header=T)
IA <- subset(IA, IA$STATION_NAME == "DES MOINES INTERNATIONAL AIRPORT IA US", select = -STATION)
IA[IA == -9999] <- NA
IA$STATION_NAME <- "IA"

KS <- read.csv("KS.csv", header=T)
KS <- subset(KS, KS$STATION_NAME == "BELLEVILLE KS US", select = -STATION)
KS[KS == -9999] <- NA
KS$STATION_NAME <- "KS"

KY <- read.csv("KY.csv", header=T)
KY <- subset(KY, select = -STATION)
KY[KY == -9999] <- NA
KY$STATION_NAME <- "KY"

MI <- read.csv("MN MO MI NE.csv", header=T)
MI <- subset(MI, MI$STATION_NAME == "BATTLE CREEK 5 NW MI US", select = -STATION)
MI[MI == -9999] <- NA
MI$STATION_NAME <- "MI"

MN <- read.csv("MN MO MI NE.csv", header=T)
MN <- subset(MN, MN$STATION_NAME == "ALEXANDRIA MUNICIPAL AIRPORT MN US", select = -STATION)
MN[MN == -9999] <- NA
MN$STATION_NAME <- "MN"

MO <- read.csv("MN MO MI NE.csv", header=T)
MO <- subset(MO, MO$STATION_NAME == "WAYNESVILLE 2 W MO US", select = -STATION)
MO[MO == -9999] <- NA
MO$STATION_NAME <- "MO"

NE <- read.csv("MN MO MI NE.csv", header=T)
NE <- subset(NE, NE$STATION_NAME == "AINSWORTH NE US", select = -STATION)
NE[NE == -9999] <- NA
NE$STATION_NAME <- "NE"

NC <- read.csv("OH NC ND.csv", header=T)
NC <- subset(NC, NC$STATION_NAME == "ALBEMARLE NC US", select = -STATION)
NC[NC == -9999] <- NA
NC$STATION_NAME <- "NC"

ND <- read.csv("OH NC ND.csv", header=T)
ND <- subset(ND, ND$STATION_NAME == "MC CLUSKY ND US", select = -STATION)
ND[ND == -9999] <- NA
ND$STATION_NAME <- "ND"

OH <- read.csv("OH NC ND.csv", header=T)
OH <- subset(OH, OH$STATION_NAME == "NEWARK WATER WORKS OH US", select = -STATION)
OH[OH == -9999] <- NA
OH$STATION_NAME <- "OH"

PA <- read.csv("PA.csv", header=T)
PA <- subset(PA, PA$STATION_NAME == "STATE COLLEGE PA US", select = -STATION)
PA[PA == -9999] <- NA
PA$STATION_NAME <- "PA"

SD <- read.csv("SD.csv", header=T)
SD <- subset(SD, SD$STATION_NAME == "IPSWICH SD US", select = -STATION)
SD[SD == -9999] <- NA
SD$STATION_NAME <- "SD"

TN <- read.csv("TN.csv", header=T)
TN <- subset(TN, select = -STATION)
TN[TN == -9999] <- NA
TN$STATION_NAME <- "TN"

TX <- read.csv("TX WI.csv", header=T)
TX <- subset(TX, TX$STATION_NAME == "DAL FTW WSCMO AIRPORT TX US", select = -STATION)
TX[TX == -9999] <- NA
TX$STATION_NAME <- "TX"

WI <- read.csv("TX WI.csv", header=T)
WI <- subset(WI, WI$STATION_NAME == "MILWAUKEE MITCHELL INTERNATIONAL AIRPORT WI US", select = -STATION)
WI[WI == -9999] <- NA
WI$STATION_NAME <- "WI"


# 3.rbind all of the 18 states for climate (temp prcp) data
d <- rbind(CO, IL, IN, IA, KS, KY, MI, MN, MO, 
           NE, NC, ND, OH, PA, SD, TN, TX, WI)
d$DATE <- as.Date(as.character(d$DATE), "%Y%m%d")
d <- rename(d, c(DATE = "date"))
d <- GetYearMonth(d)
d <- rename(d, c("STATION_NAME" = "state_abbr"))
d <- d[order(d$state_abbr, d$year),]

climate <- subset(d, month %in% c(4,5,6,7,8,9,10), 
                  select = c("state_abbr", "year", "month", "TPCP", "MMXT", "MMNT", "MNTM"))
names(climate) <- c("state_abbr", "year", "month", "prcp", "tmax", "tmin", "tmean")

climate$tmax <- ConvertToCelcius(climate$tmax)
climate$tmin <- ConvertToCelcius(climate$tmin)
climate$tmean <- ConvertToCelcius(climate$tmean)
climate$prcp <- ConvertToMM(climate$prcp)

climate18 <- data.frame(subset(climate, month == 6, select = - month)[,1:2]) #create the first 2 cols for the dataframe
climate18 <- SelectYear1995_2014(climate18)

for (i in 4:10){ # From April To October
  tempmonth <- subset(climate, month == i, select = - month)
  names(tempmonth)[3:6] <- c(paste0("prcp_",i), paste0("tmax_",i), paste0("tmin_",i), paste0("tmean_",i))
  climate18 <- merge(climate18, tempmonth, by = c("state_abbr", "year"))
}


# 4.Econ Variables - Per Capita Personal Income - Independent Variable
setwd("/Users/ac/Documents/columbia/study/2015fall/G5999 Master Thesis/Thesis Writing/Data&R")
state_names <- read.csv("State Names.csv", header=T)
state_names$state <- as.character(state_names$state)
state_names$state_name <- as.character(state_names$state_name)
state_names$state_abbr <- as.character(state_names$state_abbr)
thosestates <- c("CO", "IL", "IN", "IA", "KS", "KY", "MI", "MN", "MO", 
                 "NE", "NC", "ND", "OH", "PA", "SD", "TN", "TX", "WI")

econ <- read.csv("Personal Income Summary_Personal Income, Population_Per Capita Personal Income 2.csv",header=T)
econ <- subset(econ, econ$LineCode == 3, select = -LineCode:-Description) # Select Per Capita Personal Income
econ <- merge(state_names, econ, by.x = "state_name", by.y = "GeoName")
income <- data.frame()

for (i in 1:length(thosestates)){
  x <- thosestates[i]
  tempd <- data.frame(rep(x,35), 1980:2014, t(econ[econ$state_abbr == x, 5:length(econ)]))
  names(tempd) <- c("state_abbr", "year", "income_perca")
  income <- rbind(income, tempd)
}

income <- subset(income, year >= 1995)


# 5.Corn Price and Acreage - Independent Variable
yi <- read.csv("yield production 2.csv",header=T)
unique(yi$Data.Item)
yi$State <- as.character(yi$State)
yi$Data.Item <- as.character(yi$Data.Item)
yi$Value <- as.numeric(as.character(yi$Value))

price <- subset(yi, yi$Data.Item == "CORN, GRAIN - PRICE RECEIVED, MEASURED IN $ / BU", select = c(Year, State, Value))
names(price) <- c("year","state", "price")
price <- merge(price, state_names, by = "state", all.x = T)
price <- OrderByStateYear(price)
price_last <- data.frame()
for (i in 1:length(thosestates)){
  x <- thosestates[i]
  tempu <- subset(price, state_abbr == x)
  tempu$price_last <- c(NA, tempu$price[1:(dim(tempu)-1)])
  names(tempu) <- c("state","year","price", "state_name", "state_abbr", "price_last")
  price_last <- rbind(price_last, tempu)
}
price_last18 <- SelectYear1995_2014(price_last)

acre <- subset(yi, yi$Data.Item == "CORN - ACRES PLANTED", select = c(Year, State, Value))
names(acre) <- c("year","state", "acre")
acre <- merge(acre, state_names, by = "state", all.x = T)
acre <- OrderByStateYear(acre)
acre <- SelectYear1995_2014(acre)
acre18 <- subset(acre, state_abbr %in% thosestates)


# 6.Yield/Production - Dependent Variable
grain_prod <- subset(yi, Data.Item == "CORN, GRAIN - PRODUCTION, MEASURED IN BU", select = c(Year, State, Value))
names(grain_prod) <- c("year","state", "grain_prod")
silage_prod <- subset(yi, Data.Item == "CORN, SILAGE - PRODUCTION, MEASURED IN TONS", select = c(Year, State, Value))
names(silage_prod) <- c("year","state", "silage_prod")
prod <- merge(grain_prod, silage_prod, by = c("year","state"), all=T)
prod <- merge(prod, state_names, by = c("state"), all.x = T)
prod18 <- subset(prod, state_abbr %in% thosestates)
prod18$silage_prod[is.na(prod18$silage_prod)] <- 0
prod18 <- subset(prod18, !duplicated(prod18[,1:2]))
prod18 <- OrderByStateYear(prod18)
prod18 <- SelectYear1995_2014(prod18)
prod18$tot_prod <- prod18$grain_prod + prod18$silage_prod * 39.368
# 1 bushel = .0254 metric ton
# 1 metric ton = 39.368 bushels


grain_yi <- subset(yi, yi$Data.Item == "CORN, GRAIN - YIELD, MEASURED IN BU / ACRE", select = c(Year, State, Value))
names(grain_yi) <- c("year","state", "grain_yi")
silage_yi <- subset(yi, Data.Item == "CORN, SILAGE - YIELD, MEASURED IN TONS / ACRE", select = c(Year, State, Value))
names(silage_yi) <- c("year","state", "silage_yi")
yield <- merge(grain_yi, silage_yi, by = c("year","state"), all=T)
yield <- merge(yield, state_names, by = c("state"), all.x = T)
yield18 <- subset(yield, state_abbr %in% thosestates)
yield18$silage_yi[is.na(yield18$silage_yi)] <- 0
yield18 <- subset(yield18, !duplicated(yield18[,1:2]))
yield18 <- OrderByStateYear(yield18)
yield18 <- SelectYear1995_2014(yield18)
yield18$tot_yield <- yield18$grain_yi + yield18$silage_yi * 39.368


# 7.Table combined with Production/Yield, Climate and Econ factors
y <- merge(acre18, prod18, by = c("state", "state_name", "state_abbr", "year"), all=T)
y <- merge(y, yield18, by = c("state", "state_name", "state_abbr", "year"), all=T)
y <- merge(y, income, by = c("state_abbr", "year"), all=T)
y <- merge(y, climate18, by = c("state_abbr", "year"), all=T)
y <- merge(y, price_last18, by = c("state", "state_name", "state_abbr", "year"), all=T)
write.csv(y, "entire dataset.csv", row.names = F)
# y <- read.csv("entire dataset.csv", header=T)

y$income_cat <- cut(y$income_perca, breaks = c(min(y$income_perca)-1, 26445, 30741.6, 35276.4, 40207, max(y$income_perca)+1), label = c("low", "lower_middle", "middle", "upper_middle", "high")) 
write.csv(y, "entire dataset with income_cat.csv", row.names = F)

########################################################################################################
########################################################################################################
############# One can start right from here - Run the green lines
# y <- read.csv("entire dataset with income_cat.csv", header=T)
# y[,1] <- as.character(y[,1])
# y[,2] <- as.character(y[,2])
# y[,3] <- as.character(y[,3])
########################################################################################################
########################################################################################################

# 8.Preparation for Regression - change the scale of variables
y2 <- y
y2$income_cat <- as.numeric(y2$income_cat)
y2$acre <- y2$acre/1000000
y2$grain_prod <- y2$grain_prod/1000000
y2$tot_prod <- y2$tot_prod/1000000

y3 <- subset(y2, select = -c(state, silage_prod, silage_yi, tot_prod, tot_yield))
# Choose the variables wanted and save them in a new table

write.csv(y3, "entire dataset final.csv", row.names = F)
# y3 <- read.csv("entire dataset final.csv", header=T)
# y3[,1] <- as.character(y3[,1])
# y3[,2] <- as.character(y3[,2])

for (i in c(1995:2014)){
   table <- subset(y3, year == i)
   write.csv(table, paste0("year", i,".csv"), row.names = F)
}


# variable in the datasets are: 
#  0). Dependent Varialbe - "grain_prod", "tot_prod", "grain_yi", "tot_yield"
#  1).Climate factors - many manay ... e.g: "prcp_6", "tmean_6"
#  2).Economic factors - "income_cat"
#  3).Site factors - "acre", use fixed/random effects model determined by Hausman Test
#  4).Time Trend - "year"


########################################################################################################
########################################################################################################
############# See the Data Analysis and Visualization Part in another R script
########################################################################################################
########################################################################################################
